Introduction

We use and extend R’s naniar package to demonstrate how visualizations typically used with datasets outside of meta-analysis can be adapted to the realities of meta-analytic data (Tierney, 2019).

To demonstrate approaches to examining and diagnosing missingness, we use data on substance abuse interventions for adolescents (Tanner-Smith et al., 2016). These data comprise 328 distinct effects of substance abuse interventions on future substance use. These effects arise from 46 studies, and include information about estimated treatment effects and variances, as well as over three dozen variables describing how and where the intervention was implemented. While none of the effect estimates are missing, there is missingness on some 18 other variables of interest, including the sample composition of studies and the duration and intensity of the intervention.

load data and packages

We start by loading the necessary packages

library(naniar)
library(ggplot2)
library(visdat)
library(tidyverse)
library(gridExtra)
library(grid)
library(cowplot)
#adt_data<- readRDS("adt_data.rds")
adt_data <- readRDS("~/Documents/Columbia University/Measurement and Evaluation PhD/Northwestern/Missing Data/Meta_Analysis_MissingData/data/adt_data.RDS")

Looking at missing data with visdat

The package visdat allows visualizing the entire data frame at once.

#Visualize whole dataframe at once
vis_dat(adt_data)

#summary of whether the data is missing or not
vis_miss(adt_data)

The first plot presents a summary of our dataset structure while highlighting variables with missing data; we can initially identify 18 variables of interest with some missingness. Further, we get a better idea of how much data is missing by looking at the second plot (6.2%).

Looking at missing data with naniar

Naniar package also provides an approach to visualizing overall missingness of the dataset.

For example, we can identify variables that present a greater number/percentage of missing cases

#summary of missing variables
p1<- gg_miss_var(adt_data) + labs(y="Look at all the missing ones")

#summary of missing variables (percentage)
p2<- gg_miss_var(adt_data, show_pct=TRUE)

grid.arrange(p1, p2, ncol = 2)

We can also look at the number of missing values in each case.

#missingness in cases. Number of missing values in each case
gg_miss_case(adt_data, order_cases=TRUE) + labs(x="Number of Cases")

Explore Missingness across factors

Explore the missingness in cases over some factor variable. For example, let’s look at missingness over The Level of Care for each group.

#Level of care group 1
gg_miss_case(adt_data, facet=g1loc)

#Level of care group 2
gg_miss_case(adt_data, facet=g2loc)
## Warning: Factor `g2loc` contains implicit NA, consider using
## `forcats::fct_explicit_na`

## Warning: Factor `g2loc` contains implicit NA, consider using
## `forcats::fct_explicit_na`

We then look at the number of missings in each column, broken down by a factor variable, in this case the Level of Care for group 2.

#Level of care group 2
gg_miss_fct(x=adt_data, fct=g2loc)

For instance, in this case, we can identify the Inpatient category having 100% missing values in at least 12 variables.

Combinations of missingness across cases

Further, we can create an upset plot to examine the combinations of missingness across cases. We set the number of sets to 11, which is the number of variables that have at least 5 missing points.

gg_miss_upset(adt_data, nsets=11, nintersects=NA)

Explore missingness relationships with naniar

We can now explore variables that have larger percentage of missing data and their relationship with Effect Size g and Standard Error g

Start with Treatment Contact (hours per week) for group 1 and 2

#Point size to be 1/SE
psize=1/adt_data$se_g

#Treatment contact for group 1
#ES/SE vs Group 1 Treatment contact (Hours per week)
p3<- ggplot(adt_data, 
       aes(x= es_g , 
           y=g1hrsperweek)) + 
  geom_miss_point(size=psize) 
p3

#p4<- ggplot(adt_data, 
       #aes(x= se_g , 
           #y=g1hrsperweek)) + 
  #geom_miss_point() 

#grid.arrange(p3 + labs(x="Effect Size (Hedges' g)", y= "Group 1 Treatment contact (Hours per week)"), p4 + labs(x="Effect Size Standard Error",y= "Group 1 Treatment contact (Hours per week)" ), nrow=1)


#Treatment contact for group 1 include SE bar
#ES/SE vs Group 1 Treatment contact (Hours per week)
#ggplot(adt_data, 
       #aes(x= es_g , 
          # y=g1hrsperweek)) + 
  #geom_errorbarh(aes(xmax = es_g-se_g, xmin = es_g+se_g, height = .2))+
  #geom_miss_point() 


#Treatment contact for group 2
#ES/SE vs Group 2 Treatment contact (Hours per week)
p5<- ggplot(adt_data, 
       aes(x= es_g , 
           y=g2hrsperweek)) + 
  geom_miss_point(size=psize) 
p5

#p6<- ggplot(adt_data, 
       #aes(x= se_g , 
          #y=g2hrsperweek)) + 
  #geom_miss_point() 

#prow<- plot_grid(p5 + labs(x="Effect Size (Hedges' g)", y= "Group 2 Treatment contact (Hours per week)") + theme(legend.position = "none"), p6 + labs(x="Effect Size Standard Error",y= "Group 2 Treatment contact (Hours per week)" ) + theme(legend.position = "none"), nrow=1)

#legend<- get_legend(p5 + theme(legend.position = "bottom"))
#plot_grid(prow, legend, ncol=1, rel_heights=c(1, .2))

Duration of Treatment (days) for group 1 and 2

#Treatment Duration for group 1 and 2
#ES/SE vs Group 1 Duration of Treatment (Days)
p7<- ggplot(adt_data, 
       aes(x= es_g , 
           y=g1txdays)) + 
  geom_miss_point(size=psize) 
p7

#p8<- ggplot(adt_data, 
      #aes(x= se_g , 
           #y=g1txdays)) + 
  #geom_miss_point() 

#grid.arrange(p7 + labs(x="Effect Size (Hedges' g)", y= "Group 1 Duration of Treatment (Days)"), p8 + labs(x="Effect Size Standard Error",y= "Group 1 Duration of Treatment (Days)" ), nrow=1)


#ES/SE vs Group 2 Duration of Treatment (Days)
p9<- ggplot(adt_data, 
       aes(x= es_g , 
           y=g2txdays)) + 
  geom_miss_point(size=psize)
p9

#p10<- ggplot(adt_data, 
       #aes(x= se_g , 
           #y=g2txdays)) + 
  #geom_miss_point() 

#grid.arrange(p9 + labs(x="Effect Size (Hedges' g)", y= "Group 2 Duration of Treatment (Days)"), p10 + labs(x="Effect Size Standard Error",y= "Group 2 Duration of Treatment (Days)" ), nrow=1)

Frequency of treatment contact per week (number of sessions) for group 2

#Frequency of treatment contact per week (number of sessions)
#ES/SE vs group 2 number of sessions
p11<- ggplot(adt_data, 
       aes(x= es_g , 
           y=g2numsessions)) + 
  geom_miss_point(size=psize) 
p11

#p12<- ggplot(adt_data, 
       #aes(x= se_g , 
           #y=g2numsessions)) + 
  #geom_miss_point() 

#grid.arrange(p11 + labs(x="Effect Size (Hedges' g)", y= "Group 2 Number of Sessions"), p12 + labs(x="Effect Size Standard Error",y= "Group 2 Number of Sessions" ), nrow=1)

Percentage of black participants for group 1 and 2

#ES/SE vs group 1 black percentage
p13<- ggplot(adt_data, 
       aes(x= es_g , 
           y=g1perblack)) + 
  geom_miss_point(size=psize) 
p13

#p14<- ggplot(adt_data, 
       #aes(x= se_g , 
           #y=g1perblack)) + 
  #geom_miss_point() 

#grid.arrange(p13 + labs(x="Effect Size (Hedges' g)", y= "Group 1 Black Percentage"), p14 + labs(x="Effect Size Standard Error",y= "Group 1 Black Percentage" ), nrow=1)


#ES/SE vs group 2 black percentage
p15<- ggplot(adt_data, 
       aes(x= es_g , 
           y=g2perblack)) + 
  geom_miss_point(size=psize) 
p15

#p16<- ggplot(adt_data, 
       #aes(x= se_g , 
           #y=g2perblack)) + 
  #geom_miss_point() 

#grid.arrange(p15 + labs(x="Effect Size (Hedges' g)", y= "Group 2 Black Percentage"), p16 + labs(x="Effect Size Standard Error",y= "Group 2 Black Percentage" ), nrow=1)

We could also examine these relationships by looking at the distribution of ES, plotting for values of ES when some covariates are missing, and when they are not.

First, we will use the shadow function to keep track of missing values. Create a dataframe with the same set of columns, but with the column names added a suffix_NA

as_shadow(adt_data)
## # A tibble: 328 x 46
##    pk_es_NA studyid_NA groupid1_NA groupid2_NA varid_NA estimingpost_NA
##    <fct>    <fct>      <fct>       <fct>       <fct>    <fct>          
##  1 !NA      !NA        !NA         !NA         !NA      !NA            
##  2 !NA      !NA        !NA         !NA         !NA      !NA            
##  3 !NA      !NA        !NA         !NA         !NA      !NA            
##  4 !NA      !NA        !NA         !NA         !NA      !NA            
##  5 !NA      !NA        !NA         !NA         !NA      !NA            
##  6 !NA      !NA        !NA         !NA         !NA      !NA            
##  7 !NA      !NA        !NA         !NA         !NA      !NA            
##  8 !NA      !NA        !NA         !NA         !NA      !NA            
##  9 !NA      !NA        !NA         !NA         !NA      !NA            
## 10 !NA      !NA        !NA         !NA         !NA      !NA            
## # ... with 318 more rows, and 40 more variables: es_g_NA <fct>,
## #   se_g_NA <fct>, authoryear_NA <fct>, state_NA <fct>,
## #   `_referencesummary_NA` <fct>, design_NA <fct>, g1txcat_NA <fct>,
## #   g1loc_NA <fct>, g1attrition_NA <fct>, g1permale_NA <fct>,
## #   g1perwhite_NA <fct>, g1pernonwhite_NA <fct>, g1perblack_NA <fct>,
## #   g1perhisp_NA <fct>, g1age_NA <fct>, g1txdays_NA <fct>,
## #   g1hrsperweek_NA <fct>, g1numsessions_NA <fct>, g1mon_NA <fct>,
## #   g1imp_NA <fct>, g1man_NA <fct>, g2txcat_NA <fct>, g2loc_NA <fct>,
## #   g2attrition_NA <fct>, g2permale_NA <fct>, g2perwhite_NA <fct>,
## #   g2pernonwhite_NA <fct>, g2perblack_NA <fct>, g2perhisp_NA <fct>,
## #   g2age_NA <fct>, g2txdays_NA <fct>, g2hrsperweek_NA <fct>,
## #   g2numsessions_NA <fct>, g2mon_NA <fct>, g2imp_NA <fct>,
## #   g2man_NA <fct>, dvmicro_NA <fct>, dvsucat_NA <fct>,
## #   estxobnpost_NA <fct>, esctobnpost_NA <fct>
#Attach a shadow to the current dataframe
adt_shadow <- bind_shadow(adt_data)

Start by looking at the distribution of ES, plotting for values of ES when Treatment Contact (Hours per week) is missing, and when it is not missing

#ES/SE vs Group 1 Treatment contact (Hours per week)
p17<- ggplot(adt_shadow,
       aes(x =  es_g,
           colour =g1hrsperweek_NA )) + 
  geom_density()

p18<- ggplot(adt_shadow,
       aes(x =se_g,
           colour =g1hrsperweek_NA )) + 
  geom_density()

prow<- plot_grid(p17 + labs(x="Effect Size (Hedges' g)", colour="Group 1 \n Treatment contact")+ theme(legend.position = "none") , p18 + labs(x="Effect Size Standard Error", colour="Group 1 \n Treatment contact") + theme(legend.position = "none") , nrow=1)

legend<- get_legend(p17 + theme(legend.position = "bottom"))
plot_grid(prow, legend, ncol=1, rel_heights=c(1, .2)) 

##Set a limit to zoom in 
p17<- ggplot(adt_shadow,
       aes(x =  es_g,
           colour =g1hrsperweek_NA )) + 
  geom_density() + ylim(0,8)

prow<- plot_grid(p17 + labs(x="Effect Size (Hedges' g)", colour="Group 1 \n Treatment contact")+ theme(legend.position = "none") , p18 + labs(x="Effect Size Standard Error", colour="Group 1 \n Treatment contact") + theme(legend.position = "none") , nrow=1)

legend<- get_legend(p17 + theme(legend.position = "bottom"))
plot_grid(prow, legend, ncol=1, rel_heights=c(1, .2)) 

#ES/SE vs Group 2 Treatment contact (Hours per week)
p19<- ggplot(adt_shadow,
       aes(x =  es_g,
           colour =g2hrsperweek_NA )) + 
  geom_density()

p20<- ggplot(adt_shadow,
       aes(x =se_g,
           colour =g2hrsperweek_NA )) + 
  geom_density()

prow<- plot_grid(p19 + labs(x="Effect Size (Hedges' g)", colour="Group 2 \n Treatment contact")+ theme(legend.position = "none") , p20 + labs(x="Effect Size Standard Error", colour="Group 2 \n Treatment contact") + theme(legend.position = "none") , nrow=1)

legend<- get_legend(p19 + theme(legend.position = "bottom"))
plot_grid(prow, legend, ncol=1, rel_heights=c(1, .2)) 

Now the distribution of ES, plotting for values of ES when Duration of Treatment (days) is missing, and when it is not missing

#ES/SE vs Group 1 Duration of Treatment (Days)
p21<- ggplot(adt_shadow,
       aes(x =  es_g,
           colour =g1txdays_NA )) + 
  geom_density() 

p22<- ggplot(adt_shadow,
       aes(x =se_g,
           colour =g1txdays_NA )) + 
  geom_density()

prow<- plot_grid(p21 + labs(x="Effect Size (Hedges' g)", colour="Group 1 \n Treatment Duration")+ theme(legend.position = "none") , p22 + labs(x="Effect Size Standard Error", colour="Group 1 \n Treatment Duration") + theme(legend.position = "none") , nrow=1)

legend<- get_legend(p21 + theme(legend.position = "bottom"))
plot_grid(prow, legend, ncol=1, rel_heights=c(1, .2)) 

##Set a limit to zoom in 
p22<- ggplot(adt_shadow,
       aes(x =se_g,
           colour =g1txdays_NA )) + 
  geom_density() + ylim(0,10)

prow<- plot_grid(p21 + labs(x="Effect Size (Hedges' g)", colour="Group 1 \n Treatment Duration")+ theme(legend.position = "none") , p22 + labs(x="Effect Size Standard Error", colour="Group 1 \n Treatment Duration") + theme(legend.position = "none") , nrow=1)

legend<- get_legend(p21 + theme(legend.position = "bottom"))
plot_grid(prow, legend, ncol=1, rel_heights=c(1, .2)) 

#ES/SE vs Group 2 Duration of Treatment (Days)
p23<- ggplot(adt_shadow,
       aes(x =  es_g,
           colour =g2txdays_NA )) + 
  geom_density()

p24<- ggplot(adt_shadow,
       aes(x =se_g,
           colour =g2txdays_NA )) + 
  geom_density()

prow<- plot_grid(p23 + labs(x="Effect Size (Hedges' g)", colour="Group 2 \n Treatment Duration")+ theme(legend.position = "none") , p24 + labs(x="Effect Size Standard Error", colour="Group 2 \n Treatment Duration") + theme(legend.position = "none") , nrow=1)

legend<- get_legend(p23 + theme(legend.position = "bottom"))
plot_grid(prow, legend, ncol=1, rel_heights=c(1, .2)) 

Distribution of ES, plotting for values of ES when Percentage Black is missing, and when it is not missing

#ES/SE vs Group 1 Percetange Black
p25<- ggplot(adt_shadow,
       aes(x =  es_g,
           colour =g1perblack_NA )) + 
  geom_density()

p26<- ggplot(adt_shadow,
       aes(x =se_g,
           colour =g1perblack_NA )) + 
  geom_density()

prow<- plot_grid(p25 + labs(x="Effect Size (Hedges' g)", colour="Group 1 \n Percentage Black")+ theme(legend.position = "none") , p26 + labs(x="Effect Size Standard Error", colour="Group 1 \n Percentage Black") + theme(legend.position = "none") , nrow=1)

legend<- get_legend(p25 + theme(legend.position = "bottom"))
plot_grid(prow, legend, ncol=1, rel_heights=c(1, .2)) 

#ES/SE vs Group 2 Percentage Black
p27<- ggplot(adt_shadow,
       aes(x =  es_g,
           colour =g2perblack_NA )) + 
  geom_density()

p28<- ggplot(adt_shadow,
       aes(x =se_g,
           colour =g2perblack_NA )) + 
  geom_density()

prow<- plot_grid(p27 + labs(x="Effect Size (Hedges' g)", colour="Group 2 \n Percentage Black")+ theme(legend.position = "none"), p28 + labs(x="Effect Size Standard Error", colour="Group 2 \n Percentage Black") + theme(legend.position = "none") , nrow=1)

legend<- get_legend(p27 + theme(legend.position = "bottom"))
plot_grid(prow, legend, ncol=1, rel_heights=c(1, .2)) 

Numerical summaries for missing data

Finally, we explore numerical summaries using this package

First, determine number, proportion, and percentage of missing and complete observations

#Total number of missing observations
n_miss(adt_data)
## [1] 936
#Total number of complete observations
n_complete(adt_data)
## [1] 14152
#Proportion missing
prop_miss(adt_data)
## [1] 0.06203606
#Proportion complete
prop_complete(adt_data)
## [1] 0.9379639
#Percentage missing
pct_miss(adt_data)
## [1] 6.203606
#Percentage complete
pct_complete(adt_data)
## [1] 93.79639

Second, determine percentage missing by variable and case (study)

#Percentage missing by variable
miss_var_summary(adt_data)
## # A tibble: 46 x 3
##    variable      n_miss pct_miss
##    <chr>          <int>    <dbl>
##  1 g2hrsperweek     152     46.3
##  2 g2txdays         117     35.7
##  3 g2numsessions    111     33.8
##  4 g2loc            109     33.2
##  5 g1hrsperweek      79     24.1
##  6 g2perblack        75     22.9
##  7 g1perblack        71     21.6
##  8 g1perhisp         56     17.1
##  9 g2perhisp         51     15.5
## 10 g1numsessions     35     10.7
## # ... with 36 more rows
#Percentage missing by case (Study)
miss_case_summary(adt_data)
## # A tibble: 328 x 3
##     case n_miss pct_miss
##    <int>  <int>    <dbl>
##  1    15     17     37.0
##  2    16     17     37.0
##  3   199     12     26.1
##  4   266     10     21.7
##  5   267     10     21.7
##  6   268     10     21.7
##  7   269     10     21.7
##  8   270     10     21.7
##  9   271     10     21.7
## 10   272     10     21.7
## # ... with 318 more rows